Evaluate the cell type labels (the “truth” for scANVI / scVI).
# edit and run this locally (laptop / etc)
~/git/scEiaD_quant/workflow/scripts/hand_change_ct_labels.R
# rsync the new cell label file to biowulf
rsync -Prav scEiaD_cell_labels_2024_08_27.csv.gz h2:/home/mcgaugheyd/git/scEiaD_quant/
# create the organism level h5ad file from the individual scEiaD_quant made h5ad files
# now on biowulf
# sinteractive --mem=128G
mamba deactivate; mamba activate rscvi
python ~/git/scEiaD_modeling/workflow/scripts/merge_adata.py hs111.fin.txt /home/mcgaugheyd/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz /home/mcgaugheyd/git/scEiaD_quant/scEiaD_cell_labels_2024_08_27.csv.gz hs111.adata.solo.20240827.h5ad hs111.adata.solo.20240827.obs.csv.gz
# back to local computer to copy the cell metadata
cd ~/git/scEiaD_modeling
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/hs111.adata.solo.20240827.obs.csv.gz .
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 2: /Users/mcgaugheyd/git/scEiaD_quant/workflow/scripts/hand_change_ct_labels.R: Permission denied
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [sender]
rsync error: error in rsync protocol data stream (code 12) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [sender=2.6.9]
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 8: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 8: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8120e4ea5.txt: line 9: python: command not found
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
library(tidyverse)
sample_meta <- data.table::fread('~/git/scEiaD_quant/sample_meta.scEiaD_v1.2024_02_28.01.tsv.gz')
cell_meta <- data.table::fread('~/data/scEiaD_modeling/hs111.adata.solo.20240827.obs.csv.gz')[,-1] %>%
relocate(barcode) %>%
filter(solo_doublet == "FALSE")
hs111_eye <- cell_meta %>%
filter(study_accession != 'SRP362101') %>%
mutate(stage = case_when(as.numeric(age) <= 10 ~ 'Developing',
TRUE ~ 'Mature'),
side = case_when(tissue == 'Brain Choroid Plexus' ~ 'Brain Choroid Plexus',
grepl("Choroid|RPE", tissue) ~ 'eye',
grepl("Retina", tissue) ~ 'eye',
grepl("Outf", tissue) ~ 'FrontEye',
grepl("Iris", tissue) ~ 'FrontEye',
grepl("Sclera", tissue) ~ 'FrontEye',
grepl("Cornea", tissue) ~ 'FrontEye',
grepl("Macula", tissue) ~ 'eye',
grepl("Trabecul", tissue) ~ 'FrontEye',
grepl("Optic", tissue) ~ 'eye',
TRUE ~ tissue)) %>%
filter(organ == 'Eye', # 2024 09 03 oops
organism == 'Homo sapiens',
!grepl("^#", sample_accession),
source == 'Tissue',
#tissue %in% c("Macula", "Retina"),
#side %in% c("FrontEye", "eye"),
#side %in% c("eye"),# 2024 08 31
#capture_type == 'cell', # 2024 08 28
#kb_tech %in% c("10xv1","10xv2","10xv3"), # 2024 08 28
stage == 'Mature')# %>%
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `stage = case_when(as.numeric(age) <= 10 ~ "Developing", TRUE ~ "Mature")`.
Caused by warning:
! NAs introduced by coercion
#filter(study_accession != 'SRP447468') # 2024 08 29 remove (for now????) this optic nerve and adjacent study from sanes as it clusters (hclust) apart from everything else
# do not use studies with less than 1000 cells as ref
srp362101_sra <- read_csv('~/git/scEiaD_quant/data/SRP362101_SraRunTable.txt')
Rows: 24 Columns: 32── Column specification ──────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (24): Run, Assay Type, BIOMATERIAL_PROVIDER, BioProject, BioSample, BioSampleModel, C...
dbl (6): AGE, AvgSpotLen, Bases, Bytes, version, Sample Name
dttm (2): ReleaseDate, create_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hs111_eye <- bind_rows(hs111_eye,
cell_meta %>% filter(sample_accession %in% srp362101_sra$Experiment)) # 2024 09 11
# do not use studies with less than 1000 cells as ref
hs111_study <- hs111_eye %>%
group_by(study_accession) %>% summarise(Count = n()) %>% filter(Count>1000)
set.seed(101294)
hs111_eye$MajorCellType %>% table()
.
amacrine ape astrocyte
433712 65766 8050 10726
b beam bipolar choriocapillaris
1932 2694 39673 748
ciliary body cone cornea corneal progenitor
1113 6552 169 2
endothelial epithelial equatorial fiber
13436 39731 2905 1929
fibroblast goblet horizontal immune
38626 251 6965 4462
jct limbal lymphocyte macrophage
462 20 453 4950
mast melanocyte microglia monocyte
136 19936 963 456
mueller muscle npce oligo
26975 15450 4596 3051
opc pce pericyte ppe
1814 13395 2710 14010
red blood retinal ganglion rod rod bipolar
190 18380 98029 3959
rpc rpe schwann smooth muscle
23 1566 8192 2551
sphincter t/nk
1053 3217
hs111_ref_bcs <- hs111_eye %>%
filter(study_accession %in% hs111_study$study_accession) %>%
filter(MajorCellType != '', # 2024 04 16 big change - only use labelled cells in ref
MajorCellType != 'rpc') %>% # 2024 08 30 23 cells in 83 year old labelled as rpc???
# 2024 07 30 fixed mistake in filtering that still let them in
group_by(study_accession, MajorCellType) %>% # 2024 07 30 big change - sample on study *and* majorcelltype
sample_n(2000, replace = TRUE) %>%
unique()
hs111_query_bcs <- hs111_eye %>%
filter(!barcode %in% hs111_ref_bcs$barcode)
#hs111_ref_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_ref_bcs.full.20240911-01.csv.gz'))
#hs111_query_bcs$barcode %>% write(gzfile('~/git/scEiaD_modeling/data/hs111_mature_eye_query_bcs.full.20240911-01.csv.gz'))
now go to biowulf2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye
# will take ~6-10 hours (depending on hpc usage)
mamba deactivate; mamba activate; bash ~/git/scEiaD_modeling/Snakemake.wrapper.sh ~/git/scEiaD_modeling/workflow/Snakefile ~/git/scEiaD_modeling/config/config_hs111_mature_eye_full.yaml ~/git/scEiaD_modeling/config/cluster.json
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8544c965f.txt: line 2: mamba: command not found
/var/folders/s4/y5f1tt296dj8088gvczcx11d4lrnr7/T/RtmpZmrNsy/chunk-code-15ae8544c965f.txt: line 2: mamba: command not found
/Users/mcgaugheyd/git/scEiaD_modeling/Snakemake.wrapper.sh: line 16: snakemake: command not found
cd /Users/mcgaugheyd/data/scEiaD_modeling/hs111_mature_eye
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*scib* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*obs* .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*.leiden3.csv.gz .
rsync -Prav h2:/data/OGVFB_BG/scEiaD/2024_02_28/snakeout/hs111_mature_eye_full/*hvg.csv.gz .
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
kex_exchange_identification: read: Connection reset by peer
Connection reset by 128.231.2.3 port 22
rsync: connection unexpectedly closed (0 bytes received so far) [receiver]
rsync error: unexplained error (code 255) at /AppleInternal/Library/BuildRoots/289ffcb4-455d-11ef-953d-e2437461156c/Library/Caches/com.apple.xbs/Sources/rsync/rsync/io.c(453) [receiver=2.6.9]
scib_files <- list.files(path = '~/data/scEiaD_modeling/hs111_mature_eye_full/', full.names = TRUE) %>% grep('scib',.,value = TRUE)
scib_scores <- purrr::map(scib_files, read.csv)
names(scib_scores) <- scib_files %>% str_extract("\\d\\d\\d\\dhvg.*l") %>% gsub("^_","",.)
for (i in names(scib_scores)){
scib_scores[[i]]$Delta <- as.numeric(scib_scores[[i]][2,'Total']) - as.numeric(scib_scores[[i]][1,'Total'])
}
bind_rows(scib_scores, .id = 'Params') %>%
filter(Embedding == 'X_scVI') %>%
mutate(Params= gsub("hvg|e|l","",Params)) %>%
separate(Params, into = c("HVG","Epochs", "Latent Dimensions"), sep = "_") %>%
mutate_all(as.numeric) %>%
mutate(Embedding = 'X_scVI') %>%
arrange(-Total)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `Embedding = .Primitive("as.double")(Embedding)`.
Caused by warning:
! NAs introduced by coercion
library(tidyverse)
source('analysis_scripts.R')
obs <- pull_obs('~/data/scEiaD_modeling/hs111_mature_eye_full//hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.obs.csv.gz')
`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.
swarooks_markers <- read_csv('../data/MBrooks313_scRNAseq_Retina_Cell_type_MB_v2_positive.csv')
Rows: 257 Columns: 3── Column specification ──────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): cell_type, sub_cell_type, gene
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
diff <- pull_diff("~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.difftesting.leiden3.csv.gz")
'select()' returned 1:many mapping between keys and columns
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
obs$labels <- obs$labels %>%
left_join(diff$top_diff)
Joining with `by = join_by(leiden3)`
obs$labels %>%
arrange(aMCT) %>%
mutate(leiden3 = as.factor(leiden3)) %>%
DT::datatable(filter = 'top')
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Labelled points are clusters with more than one machine CT (above 0.1)
obs$labels %>%
mutate(mMCT_unique_count = str_count(mMCT, ',') + 1) %>%
ggplot(aes(x=solo_score,y=background_fraction)) +
geom_point(aes(color = as.factor(mMCT_unique_count), size = log1p(TotalCount))) +
ggrepel::geom_label_repel(data = . %>% filter(mMCT_unique_count > 1), aes(label = leiden3), max.overlaps = Inf)
Between author cell type and machine (scANVI) cell type
obs$labels %>% filter(aMCT != mMCT) %>% data.frame
Multiple mMCT above 10% of the total in a cluster
obs$labels %>% filter(grepl(",",mMCT)) %>% select(-solo_score, -background_fraction, -cell_probability) %>% data.frame %>% arrange(-TotalCount)
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = MCT_scANVI), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(MCT_scANVI) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = MCT_scANVI, color = MCT_scANVI)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none")
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = mCT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = mCT, color = mCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none")
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = mMCT), pointsize = 4, alpha = 0.5) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
facet_wrap(~study_accession)
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = capture_type)) +
facet_wrap(~MCT_scANVI) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot()
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(grepl("bipolar", mMCT)) %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = leiden3)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
ggtitle("bipolar") +
cowplot::theme_cowplot()
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'rpe') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = mCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot()
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = study_accession), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = leiden3)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot()+
ggtitle("amacrine")
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(mMCT == 'amacrine') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(mCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = mCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot()
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'mueller') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = leiden3)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + ggtitle("mueller")
obs$obs %>%
left_join(obs$labels, by = 'leiden3') %>% filter(mCT == 'rod') %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = as.factor(leiden3)), pointsize = 0.8, alpha = 0.8) +
ggrepel::geom_label_repel(data = . %>% group_by(leiden3) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = leiden3)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + ggtitle("rod")
Take pseudobulk values (at the cluster level) and hierarchically cluster them to ensure there aren’t any issues in either the overall structure (e.g. rod and cones are intersperse)d and/or to identify any potential mislabeled clusters
library(tidyverse)
pb <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hs111_mature_eye_20240911_withSUSPECTstudies2000hvg_200e_30l.pseudoBulk.leiden3.csv.gz')
colnames(pb) <- gsub("\\.\\d+","",colnames(pb))
hvg <- data.table::fread('~/data/scEiaD_modeling/hs111_mature_eye_full/hvg2000.csv.gz')[-1,]
rnames <- pb$V1
clust <- str_extract(rnames, '\\d+') %>% as.integer()
pb <- pb[,-1] %>% as.matrix()
row.names(pb) <- as.character(clust)
pb <- pb[as.character(obs$labels$leiden3),]
# library(SingleCellExperiment)
# library(scater)
# library(scran)
# do_hvg <- function(cts){
# sce <- SingleCellExperiment(list(counts = cts))
# sce <- logNormCounts(sce)
# hvg <- modelGeneVar(sce)
# # Visualizing the fit:
# hvg$var <- metadata(hvg)$var
# return(hvg)
# }
#
# nhvg <- do_hvg(t(pb))
pb_norm <- metamoRph::normalize_data(t(pb), sample_scale = 'cpm') %>% t()
Sample CPM scaling
log1p scaling
# remove cell cycle genes
conv_table <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
keys=gsub('\\.\\d+','',unique(colnames(pb_norm))),
columns=c("ENSEMBL","SYMBOL", "MAP","GENENAME", "ENTREZID"), keytype="ENSEMBL")
'select()' returned 1:many mapping between keys and columns
cc_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>%
left_join(conv_table, by = "ENSEMBL") %>%
mutate(cc_genes = case_when(SYMBOL %in% (Seurat::cc.genes.updated.2019 %>% unlist()) ~ TRUE)) %>%
filter(cc_genes) %>% pull(V2)
ribo_genes <- hvg %>% mutate(ENSEMBL = gsub("\\.\\d+","",V2)) %>%
left_join(conv_table, by = "ENSEMBL") %>% filter(grepl("^RPL|^RPS|^MT",SYMBOL)) %>%
pull(SYMBOL)
pb_norm <- pb_norm[,hvg$V2]
#pb_norm <- pb_norm[,hvg$V2[!hvg$V2 %in% c(cc_genes,ribo_genes)]]
# https://stats.stackexchange.com/questions/31565/compute-a-cosine-dissimilarity-matrix-in-r
sim <- pb_norm / sqrt(rowSums(pb_norm * pb_norm))
sim <- sim %*% t(sim)
D_sim <- as.dist(1 - sim)
Found more than one class "dist" in cache; using the first, from namespace 'BiocGenerics'
Also defined by ‘spam’
hclust_sim <- hclust(D_sim, method = 'average')
hclust_sim$labels <- obs$labels %>% pull(leiden3)
library(ggtree)
ggtree v3.12.0 For help: https://yulab-smu.top/treedata-book/
If you use the ggtree package suite in published research, please cite the
appropriate paper(s):
Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R
package for visualization and annotation of phylogenetic trees with their
covariates and other associated data. Methods in Ecology and Evolution. 2017,
8(1):28-36. doi:10.1111/2041-210X.12628
S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L Zhan, T Wu, E
Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact visualization of richly annotated
phylogenetic data. Molecular Biology and Evolution. 2021, 38(9):4039-4042. doi:
10.1093/molbev/msab166
Guangchuang Yu. Data Integration, Manipulation and Visualization of Phylogenetic
Trees (1st edition). Chapman and Hall/CRC. 2022, doi:10.1201/9781003279242
Attaching package: ‘ggtree’
The following object is masked from ‘package:tidyr’:
expand
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels, by = c("label" = "leiden3")) %>%
mutate(techRatio = round(techRatio, digits = ))
p + layout_dendrogram() +
geom_tiplab(aes(label = paste(label, mMCT, studyCount, TotalCount, techRatio, sep = ' - '), color = mCT)) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
guides(color="none")
p <- ggtree(hclust_sim)
p$data <- p$data %>%
left_join(obs$labels %>%
mutate(studies = case_when(studyCount ==1 ~ studies,
TRUE ~ "multiple")), by = c("label" = "leiden3")) %>%
mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
"amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
TRUE ~ "non-neural"))
p + layout_dendrogram() +
geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = class)) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
guides(color="none")
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
TRUE ~ "multiple")), by = c("label" = "leiden3"))
p + layout_dendrogram() +
geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) +
geom_tippoint(aes(shape = studies), size= 3) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
guides(color="none")
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(obs$labels %>% mutate(studies = case_when(studyCount ==1 ~ studies,
TRUE ~ "multiple")), by = c("label" = "leiden3"))
p + layout_dendrogram() +
geom_tiplab(aes(label = paste(label, mMCT, studies, sep = ' - '), color = mCT)) +
geom_tippoint(aes(shape = studies), size= 3) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
guides(color="none")
NA
NA
hclust 2 and 3 are non-neural
1 is neural
cutree_with_labels <- cutree(hclust_sim, k = 3) %>% enframe(name = 'leiden3', value = 'hclust_k3') %>%
mutate(leiden3 = as.integer(leiden3)) %>%
left_join(obs$labels) %>%
mutate(class = case_when(mCT %in% c("rod","cone","retinal ganglion",
"amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
TRUE ~ "non-neural"))
Joining with `by = join_by(leiden3)`
p <- ggtree(hclust_sim)
p$data <- p$data %>% left_join(cutree_with_labels, by = c("label" = "leiden3"))
p + layout_dendrogram() +
geom_tiplab(aes(color = as.factor(hclust_k3))) +
theme_dendrogram(plot.margin=margin(16,16,300,16)) +
scale_color_manual(values = c(pals::alphabet(), pals::glasbey()) %>% unname())
cutree_with_labels %>%
group_by(hclust_k3, class) %>%
summarise(Count = n(),
leiden3 = paste(leiden3, collapse = ', '))
`summarise()` has grouped output by 'hclust_k3'. You can override using the `.groups` argument.
Reasons: - non-neural in hclust 1 - neural in hclust 2 or 3 - SRP447468 “exclusive” neural clusters (as they seem to group with themselves instead of their respective cluster types)
remove_leiden3 <- #c(71,93,126,112,144,39,21,85,135,7,74,36,124)
# reason 1
c(cutree_with_labels %>% filter(class == 'non-neural', hclust_k3 == 1) %>% pull(leiden3),
# reason 2
cutree_with_labels %>% filter(class == 'neural', hclust_k3 %in% c(2,3)) %>% pull(leiden3),
# reason 3
c(153,131,150,62,4,100)
)
Another sanity check for some cell types
labelled_diff_testing <- diff$diff_testing %>%
left_join(conv_table, by =c( "ENSEMBL")) %>% filter(SYMBOL %in% toupper(swarooks_markers$gene))
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
scored_labelled_diff_testing <- labelled_diff_testing %>%
left_join(swarooks_markers %>% mutate(gene = toupper(gene)), by = c("SYMBOL" = "gene")) %>%
group_by(base, cell_type) %>%
summarise(score = mean(scores))
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.`summarise()` has grouped output by 'base'. You can override using the `.groups` argument.
scored_labelled_diff_testing %>%
pivot_wider(names_from = cell_type, values_from = score) %>%
dplyr::rename(leiden3 = base) %>% left_join(obs$labels %>% select(leiden3, mMCT)) %>% mutate(leiden3 = as.factor(leiden3)) %>% DT::datatable(filter = 'top')
Joining with `by = join_by(leiden3)`
Run the scEiaD_modeling again on the neural and
non-neural CLUSTERS
set.seed(1364578)
neural_bcs <- obs$obs %>%
left_join(cutree_with_labels, by = 'leiden3') %>%
filter(!leiden3 %in% remove_leiden3) %>%
mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
"amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
TRUE ~ "non-neural")) %>%
filter(class == 'neural')
neural_ref <- neural_bcs %>%
ungroup() %>%
group_by(study_accession, MCT_scANVI) %>%
#filter(tissue %in% c('Retina')) %>%
sample_n(2000, replace = TRUE) %>%
unique()
low_study_n <- neural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)
neural_ref <- neural_ref %>% ungroup() %>%
filter(!study_accession %in% low_study_n$study_accession)
neural_query <- neural_bcs %>%
filter(!barcode %in% neural_ref$barcode)
#neural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_ref_bcs.20240924-02.csv.gz'))
#neural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_neural_query_bcs.20240924-02.csv.gz'))
set.seed(1364578)
nonneural_bcs <- obs$obs %>%
left_join(cutree_with_labels, by = 'leiden3') %>%
filter(!leiden3 %in% remove_leiden3) %>%
mutate(class = case_when(MCT_scANVI %in% c("rod","cone","retinal ganglion",
"amacrine","horizontal","rod bipolar","bipolar") ~ "neural",
TRUE ~ "non-neural")) %>%
filter(class == 'non-neural')
nonneural_ref <- nonneural_bcs %>%
ungroup() %>%
group_by(study_accession, MCT_scANVI) %>%
#filter(tissue %in% c('Retina')) %>%
sample_n(2000, replace = TRUE) %>%
unique()
low_study_n <- nonneural_ref %>% group_by(study_accession) %>% count() %>% filter(n < 100)
# remove non author labelled cells from SRP447468 as a ref
# this study keeps being an outlier - just want to use cells which passed their QC
nonneural_ref <- bind_rows(nonneural_ref %>% ungroup() %>%
filter(!study_accession %in% low_study_n$study_accession,
study_accession != 'SRP447468'),
nonneural_ref %>% ungroup() %>%
filter(study_accession == 'SRP447468',
MajorCellType != 'unlabelled')
)
nonneural_query <- nonneural_bcs %>%
filter(!barcode %in% nonneural_ref$barcode)
#nonneural_ref$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_ref_bcs.20241001-02.csv.gz'))
#nonneural_query$barcode %>% write(gzfile('../data/hs111_mature_eye__stage4_NONneural_query_bcs.20241001-02.csv.gz'))
Load in new CT calls (made by hand in the __stage4_
docs)
load("Human_Mature_Eye_full__stage4_NONneural.obs.freeze20241112.Rdata")
load("Human_Mature_Eye_full__stage4_neural.obs.freeze20241112.Rdata")
obs$nobs <- bind_rows(obs_nonneural$nobs %>%
mutate(CT_l3= paste0(CT, ":", leiden3),
hrCT_l3 = paste0(hrCT, ":", leiden3)) %>%
select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>%
mutate(origin = 'nonneural') %>%
left_join(obs$obs, by = 'barcode'),
obs_neural$nobs %>%
mutate(CT_l3= paste0(CT, ":", leiden3),
hrCT_l3 = paste0(hrCT, ":", leiden3)) %>%
select(barcode, CT, hrCT,CT_l3, hrCT_l3, suspect) %>%
mutate(origin = 'neural') %>%
left_join(obs$obs, by = 'barcode'))
obs$nlabels <- bind_rows(obs_nonneural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':nn')),
obs_neural$nlabels %>% mutate(leiden3 = paste0(leiden3, ':ne'))) %>% select(-newCT) %>%
relocate(leiden3, CT, hrCT)
# label leiden3 with >90% suspect labels
sus_stage3_l3 <- obs$nobs %>%
group_by(leiden3, suspect) %>%
summarise(Count = n()) %>%
mutate(Ratio = Count/sum(Count)) %>%
filter(suspect) %>%
arrange(-Ratio) %>%
filter(Ratio > 0.9)
`summarise()` has grouped output by 'leiden3'. You can override using the `.groups` argument.
obs$nobs <- obs$nobs %>% mutate(suspect2 = case_when(leiden3 %in% sus_stage3_l3$leiden3 ~ TRUE,
suspect ~ TRUE,
!suspect ~ FALSE)) %>%
mutate(suspect = suspect2) %>%
select(-suspect2)
obs$nobs %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = CT, color = CT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
facet_wrap(~suspect)
obs$nobs %>%
filter(!suspect) %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(hrCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = hrCT, color = hrCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none")
obs$nobs %>%
filter(!suspect) %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = CT),pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(hrCT_l3, CT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = hrCT_l3,color = CT),
max.overlaps = Inf) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey(), pals::alphabet(), pals::watlington(),
pals::trubetskoy(), pals::tableau20(), pals::stepped(),
pals::polychrome(), pals::okabe()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none")
obs$nobs %>%
filter(!suspect) %>%
ggplot(aes(x=umap1,y=umap2)) +
scattermore::geom_scattermore(aes(color = hrCT), pointsize = 0.8, alpha = 0.5) +
ggrepel::geom_label_repel(data = . %>% group_by(CT, hrCT) %>%
summarise(umap1 = median(umap1),
umap2 = median(umap2)),
aes(label = hrCT, color = hrCT)) +
scale_color_manual(values = c(pals::alphabet2(), pals::glasbey()) %>% unname()) +
cowplot::theme_cowplot() + theme(legend.position = "none") +
facet_wrap(~CT)
mct <- obs$obs %>%
group_by(MajorCellType, study_accession, tissue) %>%
filter(MajorCellType != '', MajorCellType != 'unlabelled', MajorCellType != 'rpc') %>%
summarize(Count = n()) %>%
rename(CT = MajorCellType) %>%
mutate(CT = case_when(CT == 'oligo' ~ 'oligodendrocyte',
CT == 'sphincter' ~ 'muscle',
CT == 'fiber' ~ 'fibroblast',
TRUE ~ CT)) %>%
mutate(origin = case_when(CT %in% c("amacrine",
"bipolar",
"rod",
"cone",
"horizontal",
"retinal ganglion") ~ 'neural',
TRUE ~ 'nonneural')) %>%
mutate(Class = 'Author Major Cell Type')
`summarise()` has grouped output by 'MajorCellType', 'study_accession'. You can override using the `.groups` argument.
mMCT <- obs$nobs %>%
filter(!suspect) %>%
group_by(CT, study_accession, tissue, origin) %>%
summarize(Count = n()) %>%
mutate(Class = 'ML Major Cell Type')
`summarise()` has grouped output by 'CT', 'study_accession', 'tissue'. You can override using the `.groups` argument.
bind_rows(mct,
mMCT) %>%
filter(origin == 'neural') %>%
ggplot(aes(y=CT,x=Count,fill=study_accession)) +
geom_bar(stat='identity') +
ggforce::facet_row(~Class, scales = 'free') +
cowplot::theme_cowplot() +
scale_fill_manual(values = pals::kelly()[2:20])
bind_rows(mct,
mMCT) %>%
filter(origin == 'nonneural') %>%
ggplot(aes(y=CT,x=Count,fill=study_accession)) +
geom_bar(stat='identity') +
ggforce::facet_row(~Class) +
cowplot::theme_cowplot() +
scale_fill_manual(values = pals::kelly()[2:20])
# save(obs, file = 'Human_Mature_BackEye.freeze20241112.obs.Rdata')
# obs$nobs %>% write_csv("Human_Mature_BackEye.freeze20241112.nobs.csv.gz")
# obs$nlabels %>% write_csv("Human_Mature_BackEye.freeze20241112.nlabels.csv.gz")
Run mouse and chick (adult / not fetal) data against the Stage 4 nonneural and neural scANVI models.
Pondering how to best do this….
Parameters used (Stage 2 ran a bunch, in Stage 3 picked this one): 2000hvg 200e 30l
I have three models (on biowulf2):
Three approaches for new data: 1. Run against the Stage 2 model - downsides: - contains a lot of (later) removed cells (or is this an upside?) - built against
sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.7.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggtree_3.12.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
[6] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1
[11] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] fs_1.6.4 matrixStats_1.3.0 spatstat.sparse_3.1-0
[4] httr_1.4.7 RColorBrewer_1.1-3 tools_4.4.1
[7] sctransform_0.4.1 utf8_1.2.4 R6_2.5.1
[10] DT_0.33 lazyeval_0.2.2 uwot_0.2.2
[13] withr_3.0.0 sp_2.1-4 gridExtra_2.3
[16] progressr_0.14.0 cli_3.6.3 Biobase_2.64.0
[19] spatstat.explore_3.3-1 fastDummies_1.7.3 labeling_0.4.3
[22] sass_0.4.9 Seurat_5.1.0 spatstat.data_3.1-2
[25] ggridges_0.5.6 pbapply_1.7-2 yulab.utils_0.1.5
[28] R.utils_2.12.3 dichromat_2.0-0.1 parallelly_1.38.0
[31] maps_3.4.2 limma_3.60.4 rstudioapi_0.16.0
[34] RSQLite_2.3.7 pals_1.9 generics_0.1.3
[37] gridGraphics_0.5-1 ica_1.0-3 spatstat.random_3.3-1
[40] crosstalk_1.2.1 vroom_1.6.5 Matrix_1.7-0
[43] fansi_1.0.6 S4Vectors_0.42.1 abind_1.4-5
[46] R.methodsS3_1.8.2 lifecycle_1.0.4 yaml_2.3.10
[49] edgeR_4.2.1 SummarizedExperiment_1.34.0 SparseArray_1.4.8
[52] Rtsne_0.17 grid_4.4.1 blob_1.2.4
[55] promises_1.3.0 dqrng_0.4.1 crayon_1.5.3
[58] miniUI_0.1.1.1 lattice_0.22-6 beachmat_2.20.0
[61] cowplot_1.1.3 KEGGREST_1.44.1 mapproj_1.2.11
[64] pillar_1.9.0 knitr_1.48 metapod_1.12.0
[67] GenomicRanges_1.56.1 future.apply_1.11.2 codetools_0.2-20
[70] leiden_0.4.3.1 glue_1.7.0 ggfun_0.1.5
[73] spatstat.univar_3.0-0 data.table_1.16.99 vctrs_0.6.5
[76] png_0.1-8 treeio_1.28.0 spam_2.10-0
[79] gtable_0.3.5 cachem_1.1.0 xfun_0.48
[82] S4Arrays_1.4.1 mime_0.12 survival_3.7-0
[85] SingleCellExperiment_1.26.0 statmod_1.5.0 bluster_1.14.0
[88] fitdistrplus_1.2-1 ROCR_1.0-11 nlme_3.1-165
[91] bit64_4.0.5 RcppAnnoy_0.0.22 GenomeInfoDb_1.40.1
[94] bslib_0.8.0 irlba_2.3.5.1 KernSmooth_2.23-24
[97] colorspace_2.1-1 BiocGenerics_0.50.0 DBI_1.2.3
[100] tidyselect_1.2.1 bit_4.0.5 compiler_4.4.1
[103] BiocNeighbors_1.22.0 DelayedArray_0.30.1 plotly_4.10.4
[106] scales_1.3.0 lmtest_0.9-40 digest_0.6.36
[109] goftest_1.2-3 spatstat.utils_3.0-5 rmarkdown_2.27
[112] XVector_0.44.0 htmltools_0.5.8.1 pkgconfig_2.0.3
[115] sparseMatrixStats_1.16.0 MatrixGenerics_1.16.0 fastmap_1.2.0
[118] rlang_1.1.4 htmlwidgets_1.6.4 UCSC.utils_1.0.0
[121] shiny_1.9.0 DelayedMatrixStats_1.26.0 farver_2.1.2
[124] jquerylib_0.1.4 zoo_1.8-12 jsonlite_1.8.8
[127] BiocParallel_1.38.0 R.oo_1.26.0 BiocSingular_1.20.0
[130] magrittr_2.0.3 scuttle_1.14.0 GenomeInfoDbData_1.2.12
[133] ggplotify_0.1.2 dotCall64_1.1-1 patchwork_1.2.0
[136] munsell_0.5.1 Rcpp_1.0.13 ape_5.8
[139] reticulate_1.38.0 stringi_1.8.4 zlibbioc_1.50.0
[142] MASS_7.3-61 plyr_1.8.9 org.Hs.eg.db_3.19.1
[145] parallel_4.4.1 listenv_0.9.1 ggrepel_0.9.5
[148] deldir_2.0-4 Biostrings_2.72.1 splines_4.4.1
[151] tensor_1.5 hms_1.1.3 locfit_1.5-9.10
[154] igraph_2.0.3 spatstat.geom_3.3-2 RcppHNSW_0.6.0
[157] metamoRph_0.2.1 reshape2_1.4.4 stats4_4.4.1
[160] ScaledMatrix_1.12.0 evaluate_0.24.0 SeuratObject_5.0.2
[163] scran_1.32.0 tweenr_2.0.3 tzdb_0.4.0
[166] httpuv_1.6.15 RANN_2.6.1 polyclip_1.10-7
[169] future_1.34.0 scattermore_1.2 ggforce_0.4.2
[172] rsvd_1.0.5 xtable_1.8-4 RSpectra_0.16-2
[175] tidytree_0.4.6 later_1.3.2 viridisLite_0.4.2
[178] aplot_0.2.3 memoise_2.0.1 AnnotationDbi_1.66.0
[181] IRanges_2.38.1 cluster_2.1.6 timechange_0.3.0
[184] globals_0.16.3